#############################################################
# Author: Mike Burnham, mlb6496@psu.edu
# Python: 3.11.5
# OS: Windows 10
#
# Notes: This script reproduces the numbers found in the text
# of section 2.3: Effects of Controlling for Context, as well
# as footnote 2.
##############################################################

import pandas as pd
import logging

# Send every record (DEBUG and above) to the section's log file, appending to
# any existing content; each line carries a timestamp and the level name.
logging.basicConfig(
    filename="sect2_3.log",
    filemode="a+",
    format="%(asctime)-15s %(levelname)-8s %(message)s",
    level=logging.DEBUG,
)

# read the test data
ambig = pd.read_csv('trump_test_data.csv')

# flag a document as ambiguous (1) when the coder's label differs from the
# adjudicated label, and as unambiguous (0) otherwise
labels_differ = ambig['coder_label'] != ambig['adjudicated_label']
ambig['ambiguous'] = labels_differ.astype('int64')

# keep only the columns needed below
kept_columns = [
    'text',
    'target_mention',
    'dataset',
    'adjudicated_sup',
    'ambiguous',
    'trump_topic',
]
ambig = ambig[kept_columns]

# read the labels produced by each classification approach, keeping only the
# document text (used as the merge key) and that approach's label columns

# in-context labels
context = pd.read_csv('./trump_test_in_context.csv')[
    ['text', 'gpt4_bias', 'gpt3_5_bias', 'mistral_bias']
]

# NLI labels
nli = pd.read_csv('./trump_test_nli.csv')[['text', 'gL']]

# supervised-classifier labels
supervised = pd.read_csv('./trump_twitter_supervised.csv')[
    ['text', 'polibert', 'roberta', 'bertweet']
]

# attach each approach's labels to the test data, matching rows on the
# document text; left joins keep every row of the test data
# NOTE(review): if 'text' is not unique in one of the label tables, the join
# multiplies the matching rows -- confirm the texts are unique.
ambig = (
    ambig
    .merge(context, on='text', how='left')
    .merge(nli, on='text', how='left')
    .merge(supervised, on='text', how='left')
)

def _ambiguous_share(data, column, value):
    """Return the mean of ``ambiguous`` over rows where ``column == value``.

    The result is rounded to two decimal places. When no row matches, the
    mean is NaN and NaN is returned.

    Args:
        data: DataFrame holding ``column`` and an ``ambiguous`` column.
        column: Name of the column used to select the subgroup.
        value: Value of ``column`` that identifies the subgroup.
    """
    share = data.loc[data[column] == value, 'ambiguous'].mean()
    # The built-in round() is used instead of the ``.round`` method: the mean
    # of an empty selection is a plain Python float NaN, which has no
    # ``.round`` attribute and would raise AttributeError.
    return round(share, 2)


# Log the share of ambiguous documents within each subgroup: documents that
# do / do not mention Trump, and documents classified as related / not
# related to Trump.
logging.info("% Ambiguous documents among those that mention Trump: " + str(_ambiguous_share(ambig, 'target_mention', 1)))
logging.info("% Ambiguous documents among those that do not mention Trump: " + str(_ambiguous_share(ambig, 'target_mention', 0)))
logging.info("% Ambiguous documents among those classified as related to Trump: " + str(_ambiguous_share(ambig, 'trump_topic', 1)))
logging.info("% Ambiguous documents among those classified as not related to Trump: " + str(_ambiguous_share(ambig, 'trump_topic', 0)))